Setup

Data preparation

Raster data

Import multiband raster

# Import raster layer
ls <- raster::stack("D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/L8_reducers_2021_months_8-12.tif")

Select bands (predictors)

# # Subset to specific columns
# bands <- c("B_median","G_median","R_median","NIR_median","SWIR1_median","SWIR2_median","BSI_median","NDVI_median","NDMI_median")
# landsat <- subset(ls, bands)

# OR: remove only valid_pixel column
landsat <- ls[[-length(names(ls))]]

names(landsat)
##  [1] "B_median"     "B_mean"       "B_stdDev"     "G_median"     "G_mean"      
##  [6] "G_stdDev"     "R_median"     "R_mean"       "R_stdDev"     "NIR_median"  
## [11] "NIR_mean"     "NIR_stdDev"   "SWIR1_median" "SWIR1_mean"   "SWIR1_stdDev"
## [16] "SWIR2_median" "SWIR2_mean"   "SWIR2_stdDev" "BSI_median"   "BSI_mean"    
## [21] "BSI_stdDev"   "NDVI_median"  "NDVI_mean"    "NDVI_stdDev"  "EVI2_median" 
## [26] "EVI2_mean"    "EVI2_stdDev"  "SAVI_median"  "SAVI_mean"    "SAVI_stdDev" 
## [31] "MSAVI_median" "MSAVI_mean"   "MSAVI_stdDev" "NDMI_median"  "NDMI_mean"   
## [36] "NDMI_stdDev"  "NBR_median"   "NBR_mean"     "NBR_stdDev"   "NBR2_median" 
## [41] "NBR2_mean"    "NBR2_stdDev"  "NDWI_median"  "NDWI_mean"    "NDWI_stdDev"

Training data from training polygons

Import training polygons

# Import dataset
samp <- shapefile("D:/Dateien/Uni/Eagle_Master/Masterthesis/Data/trainingdata/Nuichua/LC_classes_tejas.shp")

# Need to have a numeric id for each class - helps with rasterization later on.
samp@data$class_id <- as.integer(factor(samp@data$class_id))
samp@data$plot_id <- as.integer(factor(samp@data$plot_id))
samp@data$binary_id <- as.integer(factor(samp@data$binary_id))

# Set as data table
setDT(samp@data)

Define class parameters

# set class names
classes <- c("Evergreen.Forest","Dry.Forest","Costal.Shrubland","Inland.Shrubland",
  "Bare.Ground","Agriculture","Water","Built.up")
# define color palette
cpal <- c("#162700","#a1a733","#5fa00a","#37590a","#f86b05","#c22341","#0500ff","#020103")
# save class definitions in df
classdf <- data.frame(classvalue = c(1,2,3,4,5,6,7,8),
                      classnames = classes,
                      color = cpal)

–> Raster and vector data have same CRS? If not, reproject training data.

Plot training data

mapview(samp, zcol = "landcover", layer.name = "Tejas landcover classes",
        col.regions = list("#c22341","#f86b05","#020103","#5fa00a","#a1a733","#162700","#37590a","#0500ff"))

Create training samples in R

Extract all band values

Rasterize training polygons

# Create raster template
template_rst <- raster(extent(landsat$B_median), # band B2 has resolution 10 m
                       resolution = 30,
                       crs = projection(landsat$B_median))
#samp_rst <- raster::rasterize(samp, template_rst, field = "class_id") #-> not working, don't know why :(
samp_rst <- raster("D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/LC_tejas_rasterized.tif") # import sample raster created in QGIS

Create spatial points from cells in training polygons

samp_dt <- as.data.table(rasterToPoints(samp_rst))
setnames(samp_dt, old = "LC_tejas_rasterized", new = "class_id")

points <- SpatialPointsDataFrame(coords = samp_dt[, .(x, y)],
                                 data = samp_dt,
                                 proj4string = samp_rst@crs)

mapview(points,zcol="class_id",layer.name="Point class ids")

Extract bandinfos from landsat scene at points

# dplyr syntax
dt <- terra::rast(landsat) %>%
  terra::extract(y = terra::vect(points)) %>%
  as.data.frame %>%
  mutate(class_id = points@data$class_id) %>% # add the class names to each row
  # left_join(y = unique(poly@data), by = c("class_id" = "id")) %>%
  # mutate(class_id = NULL) %>% # this column is extra now, delete it
  mutate(class_id = factor(class_id))
data.table::setDT(dt)

# Add landcover labels
dt <- dt %>%
  mutate(landcover = case_when(
    class_id == 1 ~ "Evergreen",
    class_id == 2 ~ "Dry Forest",
    class_id == 3 ~ "Coastal Shrubland",
    class_id == 4 ~ "Inland Shrubland",
    class_id == 5 ~ "Bare Ground",
    class_id == 6 ~ "Agriculture",
    class_id == 7 ~ "Water",
    class_id == 8 ~ "Built-up")) %>%
  mutate(landcover = factor(landcover))

# View the first 6 rows
head(dt)
##    ID B_median B_mean B_stdDev G_median G_mean G_stdDev R_median R_mean
## 1:  1      389    389        0      727    727        0      671    671
## 2:  2      267    267        0      607    607        0      421    421
## 3:  3      271    271        0      619    619        0      424    424
## 4:  4      264    264        0      628    628        0      431    431
## 5:  5      278    278       10      643    643        4      452    452
## 6:  6      261    261        0      619    619        0      421    421
##    R_stdDev NIR_median NIR_mean NIR_stdDev SWIR1_median SWIR1_mean SWIR1_stdDev
## 1:        0       3166     3166          0         2323       2323            0
## 2:        0       3585     3585          0         1734       1734            0
## 3:        0       3701     3701          0         1738       1738            0
## 4:        0       3670     3670          0         1736       1736            0
## 5:        3       3498     3498        129         1643       1643           73
## 6:        0       3657     3657          0         1672       1672            0
##    SWIR2_median SWIR2_mean SWIR2_stdDev BSI_median BSI_mean BSI_stdDev
## 1:         1375       1375            0       -856     -856          0
## 2:          762        762            0      -2824    -2824          0
## 3:          756        756            0      -2951    -2951          0
## 4:          762        762            0      -2896    -2896          0
## 5:          759        759           13      -2864    -2864         22
## 6:          724        724            0      -3036    -3036          0
##    NDVI_median NDVI_mean NDVI_stdDev EVI2_median EVI2_mean EVI2_stdDev
## 1:        6501      6501           0        4327      4327           0
## 2:        7898      7898           0        5421      5421           0
## 3:        7944      7944           0        5567      5567           0
## 4:        7897      7897           0        5512      5512           0
## 5:        7708      7708          61        5238      5238         167
## 6:        7934      7934           0        5516      5516           0
##    SAVI_median SAVI_mean SAVI_stdDev MSAVI_median MSAVI_mean MSAVI_stdDev
## 1:        4235      4235           0         4069       4069            0
## 2:        5270      5270           0         5356       5356            0
## 3:        5386      5386           0         5512       5512            0
## 4:        5337      5337           0         5446       5446            0
## 5:        5102      5102         136         5134       5134          178
## 6:        5346      5346           0         5459       5459            0
##    NDMI_median NDMI_mean NDMI_stdDev NBR_median NBR_mean NBR_stdDev NBR2_median
## 1:        1536      1536           0       3944     3944          0        2563
## 2:        3479      3479           0       6494     6494          0        3895
## 3:        3609      3609           0       6607     6607          0        3937
## 4:        3578      3578           0       6560     6560          0        3896
## 5:        3610      3610          33       6430     6430        158        3673
## 6:        3725      3725           0       6696     6696          0        3958
##    NBR2_mean NBR2_stdDev NDWI_median NDWI_mean NDWI_stdDev class_id
## 1:      2563           0       -6267     -6267           0        3
## 2:      3895           0       -7104     -7104           0        3
## 3:      3937           0       -7133     -7133           0        3
## 4:      3896           0       -7077     -7077           0        3
## 5:      3673         265       -6890     -6890          82        3
## 6:      3958           0       -7105     -7105           0        3
##            landcover
## 1: Coastal Shrubland
## 2: Coastal Shrubland
## 3: Coastal Shrubland
## 4: Coastal Shrubland
## 5: Coastal Shrubland
## 6: Coastal Shrubland

Histograms of predictors

dt %>%
  select(-c("class_id","ID","landcover")) %>%
  select(ends_with("median")) %>%
  melt(measure.vars = names(.)) %>%
  ggplot() +
  geom_histogram(aes(value)) +
  geom_vline(xintercept = 0, color = "gray70") +
  facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dt %>%
  select(-c("class_id","ID","landcover")) %>%
  select(ends_with("mean")) %>%
  melt(measure.vars = names(.)) %>%
  ggplot() +
  geom_histogram(aes(value)) +
  geom_vline(xintercept = 0, color = "gray70") +
  facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dt %>%
  select(-c("class_id","ID","landcover")) %>%
  select(ends_with("stdDev")) %>%
  melt(measure.vars = names(.)) %>%
  ggplot() +
  geom_histogram(aes(value)) +
  geom_vline(xintercept = 0, color = "gray70") +
  facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Split into train and test data

set.seed(321)
# A stratified random split of the data
idx_train <- createDataPartition(dt$class_id,
                                 p = 0.7, # percentage of data as training
                                 list = FALSE)
train <- dt[idx_train]
test <- dt[-idx_train]

train <- train %>% select(-c("ID","landcover"))
levels(train$class_id) <- classes

test <- test %>% select(-c("ID","landcover"))
levels(test$class_id) <- classes

table(train$class_id)
## 
## Evergreen.Forest       Dry.Forest Costal.Shrubland Inland.Shrubland 
##             1592              644             1157              633 
##      Bare.Ground      Agriculture            Water         Built.up 
##              493              622               84              326
table(test$class_id)
## 
## Evergreen.Forest       Dry.Forest Costal.Shrubland Inland.Shrubland 
##              681              276              495              271 
##      Bare.Ground      Agriculture            Water         Built.up 
##              210              266               36              139

Classification

Random Forest

Fit model

Create cross-validation folds (splits the data into n random groups)

n_folds <- 10
set.seed(321)
folds <- createFolds(1:nrow(train), k = n_folds)
# Set the seed at each resampling iteration. Useful when running CV in parallel.
seeds <- vector(mode = "list", length = n_folds + 1) # +1 for the final model
for(i in 1:n_folds) seeds[[i]] <- sample.int(1000, n_folds)
seeds[n_folds + 1] <- sample.int(1000, 1) # seed for the final model

Set trainControl parameters

ctrl <- trainControl(summaryFunction = multiClassSummary,
                     method = "cv",
                     number = n_folds,
                     search = "grid",
                     classProbs = TRUE, # not implemented for SVM; will just get a warning
                     savePredictions = TRUE,
                     index = folds,
                     seeds = seeds)

Train the classifier

# Register a doParallel cluster, using 3/4 (75%) of total CPU-s
cl <- makeCluster(3/4 * detectCores())
registerDoParallel(cl)
model_rf <- caret::train(class_id ~ . , method = "rf", data = train,
                         importance = TRUE, # passed to randomForest()
                         # run CV process in parallel;
                         # see https://stackoverflow.com/a/44774591/5193830
                         allowParallel = TRUE,
                         tuneGrid = data.frame(mtry = c(2, 3, 4, 5, 8)),
                         trControl = ctrl)
stopCluster(cl); remove(cl)
# Unregister the doParallel cluster so that we can use sequential operations
# if needed; details at https://stackoverflow.com/a/25110203/5193830
registerDoSEQ()
saveRDS(model_rf, file = "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/processed/Classification/R/initial_model_rf.rds")

Model summary & confusion matrix

model_rf$times$everything # total computation time
##        User      System verstrichen 
##       50.09        0.93      156.55
plot(model_rf, uniform=TRUE, main="Random forest")

#ggplot(model_rf) # same as above, but using the ggplot method from caret

Confusion matrix

(cm_rf <- confusionMatrix(data = predict(model_rf, newdata = test),
                         test$class_id))
## Confusion Matrix and Statistics
## 
##                   Reference
## Prediction         Evergreen.Forest Dry.Forest Costal.Shrubland
##   Evergreen.Forest              679          0                2
##   Dry.Forest                      0        266                0
##   Costal.Shrubland                1          0              491
##   Inland.Shrubland                1          4                1
##   Bare.Ground                     0          0                0
##   Agriculture                     0          6                1
##   Water                           0          0                0
##   Built.up                        0          0                0
##                   Reference
## Prediction         Inland.Shrubland Bare.Ground Agriculture Water Built.up
##   Evergreen.Forest                0           0           0     1        0
##   Dry.Forest                      4          11          19     0        5
##   Costal.Shrubland                7           0           0     1        0
##   Inland.Shrubland              259           1           0     0        0
##   Bare.Ground                     0         192           2     0        4
##   Agriculture                     1           4         240     1        9
##   Water                           0           0           0    33        0
##   Built.up                        0           2           5     0      121
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9608          
##                  95% CI : (0.9522, 0.9683)
##     No Information Rate : 0.2869          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9524          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Evergreen.Forest Class: Dry.Forest
## Sensitivity                           0.9971            0.9638
## Specificity                           0.9982            0.9814
## Pos Pred Value                        0.9956            0.8721
## Neg Pred Value                        0.9988            0.9952
## Prevalence                            0.2869            0.1163
## Detection Rate                        0.2860            0.1120
## Detection Prevalence                  0.2873            0.1285
## Balanced Accuracy                     0.9976            0.9726
##                      Class: Costal.Shrubland Class: Inland.Shrubland
## Sensitivity                           0.9919                  0.9557
## Specificity                           0.9952                  0.9967
## Pos Pred Value                        0.9820                  0.9737
## Neg Pred Value                        0.9979                  0.9943
## Prevalence                            0.2085                  0.1142
## Detection Rate                        0.2068                  0.1091
## Detection Prevalence                  0.2106                  0.1120
## Balanced Accuracy                     0.9936                  0.9762
##                      Class: Bare.Ground Class: Agriculture Class: Water
## Sensitivity                     0.91429             0.9023      0.91667
## Specificity                     0.99723             0.9896      1.00000
## Pos Pred Value                  0.96970             0.9160      1.00000
## Neg Pred Value                  0.99173             0.9877      0.99872
## Prevalence                      0.08846             0.1120      0.01516
## Detection Rate                  0.08088             0.1011      0.01390
## Detection Prevalence            0.08340             0.1104      0.01390
## Balanced Accuracy               0.95576             0.9459      0.95833
##                      Class: Built.up
## Sensitivity                  0.87050
## Specificity                  0.99687
## Pos Pred Value               0.94531
## Neg Pred Value               0.99199
## Prevalence                   0.05855
## Detection Rate               0.05097
## Detection Prevalence         0.05392
## Balanced Accuracy            0.93369
model_rf$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x)), importance = TRUE,      allowParallel = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 4.18%
## Confusion matrix:
##                  Evergreen.Forest Dry.Forest Costal.Shrubland Inland.Shrubland
## Evergreen.Forest             1584          0                6                1
## Dry.Forest                      0        624                0                4
## Costal.Shrubland                7          1             1142                6
## Inland.Shrubland                0          5               14              611
## Bare.Ground                     0         23                1                2
## Agriculture                     0         48                3                2
## Water                           0          0                1                0
## Built.up                        0          9                1                3
##                  Bare.Ground Agriculture Water Built.up class.error
## Evergreen.Forest           0           1     0        0 0.005025126
## Dry.Forest                 4          12     0        0 0.031055901
## Costal.Shrubland           0           0     1        0 0.012964564
## Inland.Shrubland           0           3     0        0 0.034755134
## Bare.Ground              456          10     0        1 0.075050710
## Agriculture                4         556     0        9 0.106109325
## Water                      1           4    76        2 0.095238095
## Built.up                  22          21     0      270 0.171779141

Predictor importance

caret::varImp(model_rf)$importance %>% 
  as.matrix %>% 
  plot_ly(x = colnames(.), y = rownames(.), z = ., type = "heatmap",
          width = 600, height = 600)
randomForest::importance(model_rf$finalModel) %>% 
  .[, - which(colnames(.) %in% c("MeanDecreaseAccuracy", "MeanDecreaseGini"))] %>% 
  plot_ly(x = colnames(.), y = rownames(.), z = ., type = "heatmap",
          width = 600, height = 600)
randomForest::varImpPlot(model_rf$finalModel)

Visualize

Class probabilities

Make prediction

system.time({
  predict_rf_2 <- raster::predict(object = landsat,
                                model = model_rf, 
                                type = 'prob' , index=1:8)
  # predict_svm <- raster::predict(object = landsat,
  #                                model = model_svm, type = 'raw')
  # predict_nnet <- raster::predict(object = landsat,
  #                                 model = model_nnet, type = 'raw')
})
##        User      System verstrichen 
##      244.27       10.65      312.08

Plot class probabilies

#viewRGB(brick(landsat[1:3]), r = 3, g = 2, b = 1) +
#mapView(predict_rf
        #,col.regions = list("#c22341","#f86b05","#020103","#5fa00a","#a1a733","#162700","#37590a","#0500ff")
 #       )
     # ,
     # mapView(predict_svm, col.regions = cls_dt$hex),
     # mapView(predict_nnet, col.regions = cls_dt$hex)
#plot(predict_rf_2)
levelplot(predict_rf_2)

Expand below to see single class probability for each class.
Click here to expand!
for (i in 1:length(classdf$classvalue)){
mycol <- colorRampPalette(c("white", classdf$color[i]))
print(levelplot(predict_rf_2[[i]], col.regions=mycol, main=classdf$classnames[i]))
}

Combine layers to create final single layer product (–> same as discrete class layer below)

# # Option 1 with terra package
# predict_rf_2_comb <- terra::app(terra::rast(predict_rf_2)#, indices = c(1,2,3,4,5,6,7,8)
#             , fun = which.max)
# str(predict_rf_2_comb)
# #levels(predict_rf_2_comb) <- as.character(classdf$classnames)
# levelplot(predict_rf_2_comb)

# Option 2 with raster package
which.max2 <- function(x, ...) which.max(x)           # helper function to absorb "na.rm"
predict_rf_2_comb <-  stackApply(predict_rf_2, rep(1,8), fun=which.max2, na.rm=NULL)

#predict_rf_2_comb <- as.factor(predict_rf_2_comb)
#levels(predict_rf_2_comb) <- as.character(classdf$classnames)

#predict_rf_2_comb@data@names <- c("1 = Evergreen Forest \n2 = Dry Forest \n3 = Coastal Shrubland \n4 = Inland Shrubland \n5 = Bare Ground \n6 = Agriculture \n7 = Water \n8 = Built-up")

#plot.new()  # open new plot
levelplot(predict_rf_2_comb, main = "Class probabilities") 

#+ layer(panel.text(predict_rf_2_comb@extent@xmin, predict_rf_2_comb@extent@ymin, text2add))
#mtext("1 = Evergreen Forest \n2 = Dry Forest \n3 = Coastal Shrubland \n4 = Inland Shrubland \n5 = Bare Ground \n6 = Agriculture \n7 = Water \n8 = Built-up", 1, line=0, adj=0, cex = 0.7)

class_df <- classdf[1:2]
data.table(class_df)
##    classvalue       classnames
## 1:          1 Evergreen.Forest
## 2:          2       Dry.Forest
## 3:          3 Costal.Shrubland
## 4:          4 Inland.Shrubland
## 5:          5      Bare.Ground
## 6:          6      Agriculture
## 7:          7            Water
## 8:          8         Built.up

Discrete classes

Make prediction

system.time({
  predict_rf <- raster::predict(object = landsat,
                                model = model_rf, 
                                type = 'raw')
  # predict_svm <- raster::predict(object = landsat,
  #                                model = model_svm, type = 'raw')
  # predict_nnet <- raster::predict(object = landsat,
  #                                 model = model_nnet, type = 'raw')
})
##        User      System verstrichen 
##      214.72        7.49      238.11

Plot classes (–> same as combined probability layer above)

#plot
levelplot(predict_rf,col.regions=cpal,att='value')

Export

# # Export probability rasters for each class
# for (i in 1:length(predict_rf_2)){
#   raster::writeRaster(predict_rf_2[[i]], paste0( "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_",i,"class_prob.tif"))
# }
# 
# # Export combined/single raster classification
# raster::writeRaster(predict_rf_2_comb, "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_prob_comb.tif")
# raster::writeRaster(predict_rf, "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_classes.tif")